In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
from nltk.stem.snowball import SnowballStemmer
import re
from nltk import word_tokenize
from nltk.corpus import stopwords
import string
from collections import defaultdict
import time
import pickle
import numpy as np
import boto3
%matplotlib inline
In [2]:
# Shared NLP helpers used throughout the notebook: an English Snowball
# stemmer and the NLTK English stopword set (used later to prune breed
# keyword sets and review tokens).
snowball = SnowballStemmer('english')
stopword_set = set(stopwords.words('english'))
In [3]:
df_reviews = pd.read_csv("s3://dogfaces/reviews/reviews.csv")
In [4]:
df_reviews.head()
Out[4]:
In [5]:
# Show a word cloud of frequent terms in a sample of reviews.
# Named `wc_stopwords` (not `stopwords`) so we don't shadow the imported
# nltk.corpus.stopwords module; dog/toy-related words are excluded because
# they dominate every review and carry no signal.
wc_stopwords = set(STOPWORDS) | {"dog", "dogs", "toy", "toys",
                                 "love", "loves", "one"}
wc = WordCloud(background_color="white", max_words=2000, stopwords=wc_stopwords)
# NOTE(review): sample() is unseeded, so the cloud differs on every run;
# pass random_state= for reproducibility.
cloud = wc.generate(' '.join(df_reviews['review_content'].sample(n=1000).values))
plt.figure(figsize=(10,8))
plt.imshow(cloud, interpolation='bilinear')
plt.axis('off')
plt.title('Wordcloud of Chewy Reviews', fontsize=20)
plt.show()
In [6]:
# source1: web
# Breed nick names scraped from the web; each row appears to be a line of
# the form "N. breed = nick or nick" (see the parsing cell below).
df_breed = pd.read_csv("breed_nick_names.txt",names=['breed_info'])
df_breed.head()
Out[6]:
In [7]:
df_breed.shape
Out[7]:
In [8]:
# Parse lines like "12. Breed Name = Nick or Other Nick" into a
# {breed: nick-name-string} dict (everything lowercased).
breeds_info = df_breed['breed_info'].values
breed_dict = {}
for breed in breeds_info:
    temp = breed.lower()
    # r'\d+\.' — raw string with an escaped dot: match the literal
    # numbering prefix ("12.").  The old pattern '\d.\s+' let the dot
    # match ANY character and relied on backtracking, and non-raw '\d'
    # escapes are deprecated in modern Python.
    temp = re.findall(r'\d+\.\s+(\D*)', temp)[0]
    temp = temp.strip().split('=')
    breed_dict[temp[0].strip()] = temp[1].strip()
In [9]:
# 1. different nick names are separated with 'or'
# Turn each nick-name string into a list of stripped names.
# NOTE: Python 2 idioms — dict.iteritems() and a list-returning map();
# under Python 3 this would need .items() and list(map(...)).
for k, v in breed_dict.iteritems():
    breed_dict[k] = map(lambda x:x.strip(), v.split(' or '))
In [10]:
# 2. get n-gram and stemmed words breed_dict
# Expand each breed's value into a keyword set: the nick names, the breed
# key itself, each word of the key, and the stems of all of the above,
# minus 'dog'/'dogs' and standard stopwords.
for k, v in breed_dict.iteritems():
    breed_dict[k] = set(v)
    breed_dict[k].add(k)
    # Stems of everything gathered so far.
    temp_set = set([snowball.stem(x) for x in breed_dict[k]])
    breed_dict[k] = breed_dict[k]|temp_set
    # Individual tokens of the multi-word breed name, raw and stemmed.
    for word in word_tokenize(k):
        breed_dict[k].add(word)
        breed_dict[k].add(snowball.stem(word))
    # Generic words would match almost every review — drop them.
    breed_dict[k] = breed_dict[k] - {'dog', 'dogs'} - stopword_set
In [11]:
print breed_dict['chow chows']
In [12]:
breed_lookup = defaultdict(set)
for k, v in breed_dict.iteritems():
for word in v:
breed_lookup[word].add(k)
breed_lookup.keys()
del_list = ['toy','blue','great','duck','coat','wire','st.','white','grey',
'black','old','smooth','west','soft']
for w in del_list:
breed_lookup.pop(w, None)
print len(breed_lookup)
In [13]:
# polish the look up tables based on 52 base classes
# Align breed_dict keys with the image classifier's label set: when a
# label is missing from breed_dict but resolves UNAMBIGUOUSLY through
# breed_lookup (directly or via its stem), rename that entry to the
# label; ambiguous or unmatched labels are reported for manual handling
# in the cells below.
breed_classes = pd.read_csv("s3://dogfaces/tensor_model/output_labels_20170907.txt",names=['breed'])
base_breeds = breed_classes['breed'].values
not_found_breed = []
for breed in base_breeds:
    if breed not in breed_dict:
        if breed in breed_lookup:
            if len(breed_lookup[breed])==1:
                # Unique match: move the keyword set under the label name
                # and remember the old key as a keyword.
                breed_in_dict = list(breed_lookup[breed])[0]
                breed_dict[breed] = breed_dict[breed_in_dict]
                breed_dict[breed].add(breed_in_dict)
                breed_dict.pop(breed_in_dict, None)
                print "replace the key {} with {}".format(breed_in_dict, breed)
            else:
                # Ambiguous: several breeds share this keyword.
                print breed, breed_lookup[breed]
        elif snowball.stem(breed) in breed_lookup:
            # Retry the match on the stemmed label.
            breed_stem = snowball.stem(breed)
            if len(breed_lookup[breed_stem])==1:
                breed_in_dict = list(breed_lookup[breed_stem])[0]
                breed_dict[breed] = breed_dict[breed_in_dict]
                breed_dict[breed].add(breed_in_dict)
                breed_dict.pop(breed_in_dict, None)
            else:
                print breed,breed_stem, breed_lookup[breed_stem]
        else:
            # No match at all; handled case-by-case below.
            not_found_breed.append(breed)
print "not found these breeds:"
print not_found_breed
In [14]:
# Handle classifier labels the web source missed: every poodle variant and
# the wheaten terrier get keyword sets built from their own name tokens
# (raw plus stemmed).
for missing in not_found_breed:
    if missing == 'wheaten terrier' or missing.endswith('poodle'):
        tokens = missing.split()
        breed_dict[missing] = set(tokens) | set(snowball.stem(t) for t in tokens)
# The generic 'poodle' entry is superseded by the size-specific labels.
breed_dict.pop('poodle', None)
In [15]:
# bullmastiff
# The classifier label is 'bull mastiff' (two words); move the scraped
# 'bullmastiffs' keyword set under that key.
if 'bull mastiff' in not_found_breed:
    breed_dict['bull mastiff'] = breed_dict['bullmastiffs']
    breed_dict.pop('bullmastiffs', None)
In [16]:
# english springer
# Same renaming pattern: classifier label 'english springer' takes over
# the scraped 'english springer spaniels' entry.
if 'english springer' in not_found_breed:
    breed_dict['english springer'] = breed_dict['english springer spaniels']
    breed_dict.pop('english springer spaniels', None)
In [17]:
# german short haired, german shepherd and 'american bulldog'
# The same rename-if-unresolved pattern was copy-pasted three times;
# factor it into a small helper.
def _adopt_breed(target, source, extra=None):
    # Move breed_dict[source] under the classifier label `target`,
    # optionally merging extra keywords; no-op if `target` was already
    # resolved (i.e. not in not_found_breed).
    if target in not_found_breed:
        breed_dict[target] = breed_dict[source] | (extra or set())
        breed_dict.pop(source, None)

name = 'american bulldog'
_adopt_breed(name, 'bulldog',
             set(name.split()) | set(snowball.stem(w) for w in name.split()))
_adopt_breed('german shorthaired', 'german shorthaired pointers')
_adopt_breed('german shepherd', 'german shepherd dog')
In [18]:
# basset dog
# Merge both basset-type keyword sets under the classifier label 'basset'.
breed_dict['basset'] = breed_dict['basset hound']|breed_dict['petits bassets griffons vendeens']
In [19]:
'basset' in base_breeds
Out[19]:
In [59]:
sorted(breed_dict.keys())
Out[59]:
In [31]:
# Pick one random review and show which breed keywords fire on it.
# NOTE(review): np.random.randint is unseeded — output changes every run.
ind = np.random.randint(df_reviews.shape[0])
text_review = df_reviews['review_content'][ind].lower()
print text_review
puncs = string.punctuation
# Tokenize, drop punctuation tokens and stopwords, then stem the rest.
reduced_set = set([snowball.stem(x) for x in (set(filter(lambda x: x not in puncs, word_tokenize(text_review)))
                                              - stopword_set)])
po_breeds = []
for w in reduced_set:
    if w in breed_lookup:
        po_breeds.extend(breed_lookup[w])
print po_breeds
In [68]:
df_reviews.columns
Out[68]:
In [32]:
def getReviewBreed(text):
    """Return the list of candidate breed keys mentioned in one review.

    Lowercases and tokenizes the text, drops punctuation tokens and
    stopwords, stems what remains, and collects every breed whose
    keyword set (via the module-level `breed_lookup`) contains a stem.
    """
    # Reviews come out of pandas as byte strings under Python 2; decode
    # them once so tokenization sees unicode.  Guarding with isinstance
    # avoids the latent bug of calling .decode on already-decoded text.
    ntext = text.decode('utf-8') if isinstance(text, bytes) else text
    tokens = set(word_tokenize(ntext.lower()))
    candidates = set(w for w in tokens if w not in string.punctuation) - stopword_set
    reduced_set = set(snowball.stem(w) for w in candidates)
    po_breeds = []
    for w in reduced_set:
        if w in breed_lookup:
            po_breeds.extend(breed_lookup[w])
    return po_breeds
def getBreedTable(df):
    """Build a (review_id, toy_id, breed_extract) DataFrame from reviews.

    `breed_extract` holds the list of breeds getReviewBreed found in each
    row's `review_content`.  (An unused local `N = df.shape[0]` was removed.)
    """
    breed = []
    review_id = []
    toy_id = []
    for ind, row in df.iterrows():
        breed.append(getReviewBreed(row['review_content']))
        review_id.append(row['review_id'])
        toy_id.append(row['toy_id'])
    return pd.DataFrame({'review_id':review_id, 'toy_id':toy_id, 'breed_extract':breed})
In [33]:
# Run the breed extraction over every review and time the full pass.
# .copy() keeps df_reviews itself untouched.
test_df = df_reviews.copy()
start_time = time.time()
new_df = getBreedTable(test_df)
print time.time() - start_time
In [34]:
new_df.head()
Out[34]:
In [35]:
df_reviews['review_content'][1]
Out[35]:
In [36]:
new_df.shape
Out[36]:
In [37]:
# Join the extracted breeds back onto the review metadata, then drop the
# raw text column (DataFrame.pop mutates df_extract in place).
df_extract = pd.merge(df_reviews, new_df, on=['review_id', 'toy_id'])
df_extract.pop('review_content')
print df_extract.shape
df_extract.head()
Out[37]:
In [65]:
#ind = np.random.randint(df_extract.shape[0])
# Spot-check one review (fixed index 4) against its extracted breeds.
ind = 4
print df_reviews['review_content'][ind]
print df_extract['breed_extract'][ind]
In [83]:
df_extract['breed_extract'] = df_extract['breed_extract'].apply(lambda row:','.join(row))
In [84]:
df_extract.head()
Out[84]:
In [85]:
np.sum(df_extract['breed_extract'].isnull())
Out[85]:
In [78]:
breed_lookup['poodle']
Out[78]:
In [86]:
# Persist the breed-annotated reviews back to S3 as CSV.
# to_csv(index=False) with no path returns the CSV text as a string,
# which boto3 then uploads as the object body.
save_data = df_extract.to_csv(index=False)
s3_res = boto3.resource('s3')
s3_res.Bucket('dogfaces').put_object(Key='reviews/extract_breed_review.csv', Body=save_data)
Out[86]:
In [66]:
# save breed_lookup
# save breed_dict
# Pickle both lookup tables for reuse outside this notebook.
# NOTE: pickles should only ever be loaded from trusted sources, and
# HIGHEST_PROTOCOL files may not load under older Python versions.
with open('breed_lookup.pickle', 'wb') as handle:
    pickle.dump(breed_lookup, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('breed_dict.pickle', 'wb') as handle:
    pickle.dump(breed_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
In [7]:
# source 2: classified dog names
# Class labels produced by the TensorFlow model, one per line (the "base
# classes" referenced earlier).
breed_classes = pd.read_csv("s3://dogfaces/tensor_model/output_labels_20170907.txt",names=['breed'])
breed_classes.head()
Out[7]:
In [81]:
# generate a data frame, review_id, toy_id, breed
# Number of distinct review ids; nunique(dropna=False) matches
# len(Series.unique()) even when null ids are present.
df_extract['review_id'].nunique(dropna=False)
Out[81]:
In [ ]: